In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import os
import sys, os
sys.path.insert(0, os.path.abspath('..'))
import data_generation.diff_utils
import data_generation.mwdiff.mwdiffs_to_tsv
import numpy as np
In [2]:
# Load the raw toxicity annotations (one row per worker rating of a revision).
df_raw = pd.read_csv("../../data/toxicity_annotations/raw/toxicity_for_ellery.csv")
In [3]:
# Work on a copy so df_raw stays untouched if cells below are re-run.
df = df_raw.copy()
In [4]:
# (n_annotations, n_columns) of the raw table
df.shape
Out[4]:
In [5]:
# 'query' encodes namespace and sampling strategy as '<ns>_<sample>'
# (split into separate columns in the next cell).
df['query'].value_counts()
Out[5]:
In [6]:
# 'query' is '<ns>_<sample>…': first underscore-separated token is the
# namespace, second is the sampling strategy.  Use the vectorized .str
# accessor instead of two per-row .apply calls; unlike the lambdas, it also
# propagates NaN instead of raising on a missing value.
query_parts = df['query'].str.split('_')
df['ns'] = query_parts.str[0]
df['sample'] = query_parts.str[1]
In [7]:
# Index by rev_id so per-revision Series align with the annotation rows.
df.index = df.rev_id
# For each revision, how many distinct 'sample' values it was tagged with.
# BUGFIX: the original assigned this to `df.sample_count`, which sets a
# DataFrame *attribute* rather than a column, and then updated 'sample' via
# chained indexing (df['sample'][mask] = ...), which raises
# SettingWithCopyWarning and silently does nothing under copy-on-write
# pandas.  Use a plain local variable and .loc instead.
sample_count = df.drop_duplicates(subset=['rev_id', 'sample'])['rev_id'].value_counts()
print(sample_count.value_counts())
# just set them all to random
df.loc[df['rev_id'].map(sample_count) == 2, 'sample'] = 'random'
print(df.drop_duplicates(subset=['rev_id', 'sample'])['rev_id'].value_counts().value_counts())
In [8]:
# Binary toxicity label: 1 when the annotator's score is negative.
# .astype(int) is the vectorized idiom (original used element-wise .apply(int)).
df['toxicity'] = (df['toxicity_score'] < 0).astype(int)
In [9]:
# Raw score distribution (NaN kept visible via dropna=False).
df['toxicity_score'].value_counts(dropna=False)
Out[9]:
In [10]:
# Distribution of the derived binary label.
df['toxicity'].value_counts(dropna=False)
Out[10]:
In [11]:
# Drop '_golden' rows — presumably quality-control test questions, not real
# annotations — TODO confirm against the annotation platform docs.
df = df.query('_golden == False')
print('# annotations: ', df.shape[0])
In [12]:
# NOTE(review): mid-notebook import; consider moving to the top import cell.
from baselines import remove_na
# remove all annotations for a revisions where more than 50% of annotators for that revision could not read the comment
df = remove_na(df)
print('# annotations: ', df.shape[0])
In [13]:
# remove all annotations where the annotator could not read the comment
df = df.query('na==False')
print('# annotations: ', df.shape[0])
In [14]:
# Annotations per (revision, worker) pair; counts > 1 indicate duplicate ratings.
df.groupby(['rev_id', '_worker_id']).size().value_counts()
Out[14]:
In [15]:
# Keep a single annotation per (revision, worker) pair.
df = df.drop_duplicates(subset = ['rev_id', '_worker_id'])
print('# annotations: ', df.shape[0])
In [16]:
# One row per revision.
comments = df.drop_duplicates(subset = ['rev_id'])
print(comments.shape[0])
In [17]:
# Keep only one revision per distinct comment text.
u_comments = comments.drop_duplicates(subset = ['comment_text'])
print(u_comments.shape[0])
In [18]:
# Inner merge drops annotations whose revision was removed by the text dedupe.
df = df.merge(u_comments[['rev_id']], how = 'inner', on = 'rev_id')
print('# annotations: ', df.shape[0])
In [19]:
# Re-check label distributions after the filtering steps above.
df['toxicity_score'].value_counts(dropna=False)
Out[19]:
In [20]:
df['toxicity'].value_counts(dropna=False)
Out[20]:
In [21]:
# Number of annotations per revision, as a frame with columns ['n', 'rev_id']
# (rev_id duplicated out of the index for the merge below).
rev_counts = df['rev_id'].value_counts()
counts = rev_counts.to_frame()
counts.columns = ['n']
counts = counts.assign(rev_id=rev_counts.index)
In [22]:
# Distribution of annotations-per-revision.
counts['n'].value_counts().head()
Out[22]:
In [23]:
# Keep revisions with at least 8 annotations.
# NOTE(review): 8 is a magic threshold — confirm the intended minimum.
counts_enough = counts.query("n>=8")
In [24]:
# Drop annotations for under-annotated revisions.
df = df.merge(counts_enough[['rev_id']], how = 'inner', on = 'rev_id')
print('# annotations: ', df.shape[0])
In [25]:
# One row per revision, carrying per-comment metadata.
df_comments = df.drop_duplicates(subset = ['rev_id']).copy()
# Logged-out (anonymous) editors have no user_id.
df_comments['logged_in'] = df_comments['user_id'].notnull()
# Vectorized .dt.year replaces the per-row .apply(lambda x: x.year).
df_comments['year'] = pd.to_datetime(df_comments['rev_timestamp']).dt.year
In [26]:
# Assign each comment to train/dev/test (60/20/20).
# Seed the RNG so the split is reproducible across notebook runs; the
# original was unseeded, so every run shipped a different split.
np.random.seed(12345)
elements = np.array(["train", "dev", "test"])
probabilities = np.array([0.6, 0.2, 0.2])
# np.random.choice accepts the probability array directly; no list() needed.
df_comments['split'] = np.random.choice(elements, size=df_comments.shape[0], p=probabilities)
In [27]:
# Check split proportions look roughly 60/20/20.
df_comments['split'].value_counts()
Out[27]:
In [28]:
# Map platform worker ids to sequential anonymous ids and attach to df.
df_workers = df[['_worker_id']].drop_duplicates()
df_workers['anon_id'] = range(df_workers.shape[0])
df = df.merge(df_workers, how = 'inner', on = '_worker_id')
df.shape
# save worker id mapping
# NOTE(review): this file links anon ids back to real worker ids; it is
# written into the figshare dir — confirm it is excluded from the public release.
df_workers.to_csv(os.path.join( "../../data/figshare", 'toxicity_annotations_worker_id_map.tsv'), sep = '\t', index = False)
In [169]:
# fix legacy special token issues
# replace_special_chars normalizes special characters in the diff text —
# TODO confirm its exact semantics in data_generation.mwdiff.mwdiffs_to_tsv.
df_comments['diff'] = df_comments['diff'].apply(data_generation.mwdiff.mwdiffs_to_tsv.replace_special_chars)
# Expand bare legacy markers to the current token names.
# NOTE(review): 'TAB' -> 'TAB_TOKEN' would also turn an existing 'TAB_TOKEN'
# into 'TAB_TOKEN_TOKEN'; assumes no expanded tokens are present yet — confirm.
df_comments['diff'] = df_comments['diff'].apply(lambda x: x.replace('TAB', 'TAB_TOKEN'))
df_comments['diff'] = df_comments['diff'].apply(lambda x: x.replace('NEWLINE', 'NEWLINE_TOKEN'))
# Presumably double quotes interfere with the TSV output, so swap for backticks — verify.
df_comments['diff'] = df_comments['diff'].apply(lambda x: x.replace('"', '`'))
# apply latest version of clean and filter
df_comments = data_generation.diff_utils.clean_and_filter(df_comments)
# clean and filter drops some comments, so drop associated labels
df = df.merge(df_comments[['rev_id']], how = 'inner', on = 'rev_id' )
In [170]:
# Rename to the release schema, keep only the released columns, and order
# rows by revision id.  Note: 'rev_timestamp' is renamed to 'timestamp' but
# 'timestamp' is not in the released column list, so it is dropped here.
release_columns = ['rev_id', 'comment', 'year', 'logged_in', 'ns', 'sample', 'split']
df_comments = (
    df_comments
    .rename(columns={'clean_diff': 'comment', 'rev_timestamp': 'timestamp'})
    [release_columns]
    .sort_values('rev_id')
)
df_comments.shape
Out[170]:
In [171]:
# Human labels for release: one row per (revision, worker) rating, with the
# internal anon_id exposed as 'worker_id'.
label_columns = ['rev_id', 'anon_id', 'toxicity', 'toxicity_score']
df_toxicity_labels = (
    df[label_columns]
    .rename(columns={'anon_id': 'worker_id'})
    .sort_values('rev_id')
)
In [172]:
# save dfs
df_comments.to_csv(os.path.join( "../../data/figshare", 'toxicity_annotated_comments.tsv'), sep = '\t', index = False)
df_toxicity_labels.to_csv(os.path.join( "../../data/figshare", 'toxicity_annotations.tsv'), sep = '\t', index = False)
In [173]:
# Sanity check: shape of the written comments file.
pd.read_csv(os.path.join( "../../data/figshare", 'toxicity_annotated_comments.tsv'), sep = '\t').shape
Out[173]:
In [174]:
# Sanity check: unique revisions in the annotations file should match the comment count above.
pd.read_csv(os.path.join( "../../data/figshare", 'toxicity_annotations.tsv'), sep = '\t').drop_duplicates(subset = 'rev_id').shape
Out[174]:
In [176]:
# Visual spot-check of the released comments table.
df_comments.head()
Out[176]:
In [ ]: